{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Analysis in Korean (using Naver Sentiment Movie Corpus)\n",
    "- Sentiment Analysis with RNN (GRU)\n",
    "- Dataset source: https://github.com/e9t/nsmc\n",
    "- To install & use ```Mecab``` tagger in Windows, refer to https://groups.google.com/forum/#!topic/konlpy/SuMc8EkCT_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "import matplotlib.pyplot as plt\n",
    "from konlpy.tag import Mecab\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import *\n",
    "from keras.layers import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Import & process dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>document</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8112052</td>\n",
       "      <td>어릴때보고 지금다시봐도 재밌어요ㅋㅋ</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8132799</td>\n",
       "      <td>디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4655635</td>\n",
       "      <td>폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>9251303</td>\n",
       "      <td>와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10067386</td>\n",
       "      <td>안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         id                                           document  label\n",
       "0   8112052                                어릴때보고 지금다시봐도 재밌어요ㅋㅋ      1\n",
       "1   8132799  디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...      1\n",
       "2   4655635               폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.      1\n",
       "3   9251303  와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...      1\n",
       "4  10067386                        안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.      1"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_table(\"ratings.txt\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "mecab = Mecab(\"C:\\\\mecab\\\\mecab-ko-dic\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['어릴', '때', '보', '고', '지금', '다시', '봐도', '재밌', '어요', 'ㅋㅋ']"
      ]
     },
     "execution_count": 115,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mecab.morphs(df[\"document\"][0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Reviews:  199992 199992\n",
      "Number of tokens:  3669567\n",
      "Number of unique tokens:  61039\n",
      "Wall time: 24.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "reviews = []\n",
    "labels = []\n",
    "all_tokens = []\n",
    "unique_tokens = dict()\n",
    "\n",
    "for i in range(len(df)):\n",
    "    try:\n",
    "        tokens = mecab.morphs(df[\"document\"][i])\n",
    "        reviews.append(tokens)\n",
    "        labels.append(df[\"label\"][i])\n",
    "        \n",
    "        all_tokens += tokens\n",
    "        for t in tokens:\n",
    "            if t in unique_tokens.keys():\n",
    "                unique_tokens[t] += 1\n",
    "            else:\n",
    "                unique_tokens[t] = 1\n",
    "    except:\n",
    "        pass\n",
    "    \n",
    "print(\"Number of Reviews: \", len(reviews), len(labels))\n",
    "print(\"Number of tokens: \", len(all_tokens))\n",
    "print(\"Number of unique tokens: \", len(unique_tokens))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_dictionary(unique_tokens, threshold):\n",
    "    token_to_idx = dict()\n",
    "    idx_to_token = dict()\n",
    "    unique_token_keys = list(unique_tokens.keys())\n",
    "    \n",
    "    j = 0\n",
    "    for i in range(len(unique_token_keys)):\n",
    "        if unique_tokens[unique_token_keys[i]] > threshold:\n",
    "            token_to_idx[unique_token_keys[i]] = j\n",
    "            idx_to_token[j] = unique_token_keys[i]\n",
    "            j += 1\n",
    "    \n",
    "    return token_to_idx, idx_to_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2437 2437\n"
     ]
    }
   ],
   "source": [
    "token_to_idx, idx_to_token = create_dictionary(unique_tokens, 100)\n",
    "\n",
    "print(len(token_to_idx), len(idx_to_token))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 2.12 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "for i in range(len(reviews)):\n",
    "    for j in range(len(reviews[i])):\n",
    "        if reviews[i][j] in token_to_idx.keys():\n",
    "            reviews[i][j] = token_to_idx[reviews[i][j]]\n",
    "        else:\n",
    "            reviews[i][j] = None\n",
    "            \n",
    "    reviews[i] = [x for x in reviews[i] if x != None]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of non-empty reviews:  198701\n"
     ]
    }
   ],
   "source": [
    "i = 0\n",
    "for r in reviews:\n",
    "    if len(r)!=0:\n",
    "        i += 1\n",
    "        \n",
    "print(\"Number of non-empty reviews: \", i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "198701 198701\n"
     ]
    }
   ],
   "source": [
    "for i in range(len(reviews)):\n",
    "    if len(reviews[i]) == 0:\n",
    "        labels[i] = None\n",
    "\n",
    "reviews = [x for x in reviews if len(x) != 0]\n",
    "labels = [x for x in labels if x != None]\n",
    "\n",
    "print(len(reviews), len(labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "198701 198701\n"
     ]
    }
   ],
   "source": [
    "X_data = pad_sequences(reviews, maxlen = 30)\n",
    "y_data = np.asarray(labels)\n",
    "\n",
    "print(len(X_data), len(y_data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Create model\n",
    "- Model with two GRU layers (with 50 cells each)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(158960, 30) (39741, 30) (158960,) (39741,)\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(np.asarray(X_data), np.asarray(y_data),test_size = 0.2)\n",
    "\n",
    "print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def simple_rnn_model(gpu = False):\n",
    "    model = Sequential()\n",
    "    model.add(Embedding(len(token_to_idx), 50, input_length = X_train.shape[1]))\n",
    "    if gpu:\n",
    "        model.add(CuDNNGRU(50, return_sequences = True))\n",
    "        model.add(CuDNNGRU(50))\n",
    "    else:\n",
    "        model.add(GRU(50, return_sequences = True))\n",
    "        model.add(GRU(50))\n",
    "    model.add(Dense(50, activation = \"relu\"))\n",
    "    model.add(Dense(1, activation = \"sigmoid\"))\n",
    "    model.compile(loss = \"binary_crossentropy\", optimizer = \"adam\", metrics = [\"acc\"])\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "embedding_3 (Embedding)      (None, 30, 50)            121850    \n",
      "_________________________________________________________________\n",
      "cu_dnngru_23 (CuDNNGRU)      (None, 30, 50)            15300     \n",
      "_________________________________________________________________\n",
      "cu_dnngru_24 (CuDNNGRU)      (None, 50)                15300     \n",
      "_________________________________________________________________\n",
      "dense_37 (Dense)             (None, 50)                2550      \n",
      "_________________________________________________________________\n",
      "dense_38 (Dense)             (None, 1)                 51        \n",
      "=================================================================\n",
      "Total params: 155,051\n",
      "Trainable params: 155,051\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "# If you do not have GPU, set gpu parameter as False!\n",
    "model = simple_rnn_model(True)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Model Training & Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 143064 samples, validate on 15896 samples\n",
      "Epoch 1/10\n",
      "143064/143064 [==============================] - 4s - loss: 0.4575 - acc: 0.7799 - val_loss: 0.3951 - val_acc: 0.8178\n",
      "Epoch 2/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3675 - acc: 0.8379 - val_loss: 0.3673 - val_acc: 0.8369\n",
      "Epoch 3/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3577 - acc: 0.8430 - val_loss: 0.3562 - val_acc: 0.8432\n",
      "Epoch 4/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3474 - acc: 0.8483 - val_loss: 0.3480 - val_acc: 0.8459\n",
      "Epoch 5/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3341 - acc: 0.8546 - val_loss: 0.3447 - val_acc: 0.8476\n",
      "Epoch 6/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3230 - acc: 0.8592 - val_loss: 0.3326 - val_acc: 0.8529\n",
      "Epoch 7/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3122 - acc: 0.8647 - val_loss: 0.3285 - val_acc: 0.8529\n",
      "Epoch 8/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.3042 - acc: 0.8676 - val_loss: 0.3252 - val_acc: 0.8568\n",
      "Epoch 9/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.2969 - acc: 0.8714 - val_loss: 0.3260 - val_acc: 0.8568\n",
      "Epoch 10/10\n",
      "143064/143064 [==============================] - 2s - loss: 0.2918 - acc: 0.8735 - val_loss: 0.3267 - val_acc: 0.8574\n"
     ]
    }
   ],
   "source": [
    "hist = model.fit(X_train, y_train, validation_split = 0.1, epochs = 10, batch_size = 1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39456/39741 [============================>.] - ETA: 0sTest Accuracy:  0.85488538286\n"
     ]
    }
   ],
   "source": [
    "print(\"Test Accuracy: \", model.evaluate(X_test, y_test)[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
